getwd()
## [1] "/Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks"

Load libraries and source scripts

source("../references/biostats.R")

list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Inspect total counts by sample for larval samples

load(file = "../results/gene-counts-filtered") #object = counts.filtered

ggplotly(
ggplot(data = data.frame(colSums(counts.filtered)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample") %>% 
                  filter(grepl("_larvae", sample))) +
           geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample, Larvae") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())) 

### Load transformed counts object

load(file = "../results/gene-counts-trans") #object = counts.t

Extract larval samples only

counts.t.larvae <- as.data.frame(counts.t[grepl(".larvae", rownames(counts.t)), ])
counts.t.larvae[,1:3] #take a peak at the resulting dataframe 
##             OLUR_00020575 OLUR_00006628 OLUR_00032161
## 39_larvae               3            65           238
## 401_larvae              1            69           263
## 402_larvae              2            58           218
## 403_larvae              0            72           420
## 404_larvae              0            52           929
## 411_larvae              1           101           495
## 413_larvae              1            47           593
## 414_larvae              0            67            24
## 421_larvae              0            78           190
## 43_larvae               3            91           170
## 431b_larvae             0            97           687
## 434_larvae              0            18           430
## 442b_larvae             0            50           355
## 443_larvae              1            21           861
## 444_larvae              0           242          1573
## 445_larvae              0            97           444
## 451_larvae              1           147          1042
## 452b_larvae             0            95           589
## 453_larvae              4            60           182
## 461b_larvae             0            86           513
## 47_larvae               0           130           253
## 471b_larvae             0            99           576
## 472b_larvae             1           211           530
## 473_larvae              1           366           878
## 474_larvae              0           182           920
## 475_larvae              0           314          1270
## 476_larvae              0           193          1086
## 477_larvae              0           106           574
## 481_larvae              1            41           185
## 482_larvae              0            57           521
## 483_larvae              0            38           146
## 484_larvae              3            39           196
## 485_larvae              1           182           393
## 487_larvae              2            94           323
## 488_larvae              0           176          1156
## 489_larvae              0            98           485
## 490_larvae              1            76           324
## 491_larvae              0            57           425
## 492_larvae              3            25           371
## 506_larvae              0             9           294
## 513_larvae              0            68           112
## 522_larvae              0            91           234
## 523_larvae              1           123           934
## 524_larvae              0            57           998
## 525_larvae              0            64           184
## 526_larvae              2           119           304
## 527_larvae              1            51           107
## 528_larvae              0           165           740
## 529_larvae              0            89           645
## 531_larvae              3           106           280
## 532_larvae              1            50           262
## 533_larvae              0            47           389
## 541_larvae              0            71           660
## 542_larvae              0            20           155
## 543_larvae              1            85           419
## 551_larvae              0           257           966
## 552b_larvae             0            80           260
## 553_larvae              1           127           308
## 554_larvae              3           280           531
## 563_larvae              2            57           975
## 564_larvae              0           124           474

Optional

Pre-filtering - remove rows (genes) with less than a total of 10 reads (across all samples) (why?)

keep <- colSums(counts.t.larvae) >= 10
counts.ts.larvae <- counts.t.larvae[,keep]
print(paste("# genes remaining after pre-filtering:", ncol(counts.ts.larvae)))
## [1] "# genes remaining after pre-filtering: 30443"
print(paste("# of genes dropped:", ncol(counts.t.larvae) - ncol(counts.ts.larvae), sep=" "))
## [1] "# of genes dropped: 1767"

How many genes were identified in each sample?

ggplotly(
ggplot(data = data.frame(rowSums(counts.t.larvae != 0)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample") %>% 
                  filter(grepl("_larvae", sample))) +
           geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample, larvae, pooled") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())) 

Use foa.plots to visualize data a bit:

In how many samples does each gene occur?

  • The first four plots portray the gene’ frequency of occurrence among samples in a number of different ways – either as an empirical cumulative distribution function (ECDF) of gene occurrence or as a histogram of gene occurrence.

What is the mean abundance of each gene when it occurs (not averaging zeros for samples where it is absent)?

  • The fifth plot is an ECDF of gene mean abundance. X-axis is samples, ranked from 1-n in terms of mean gene abundance.

Is the mean abundance of genes correlated with the number of samples they occur in?

  • The sixth plot is a scatter plot of frequency of occurrence against mean abundance. Is there any apparent relationship between the two? Are the widespread genes also generally more abundant? Are there many widespread genes that occur at low abundance? Conversely, are there genes much less widespread, but abundant where they occur?

Is the total abundance of gene in a sample correlated with the number of gene in a sample? To answer this question, first it is instructive to look at the number of gene per sample.

  • The eighth plot depicts the ECDF of sample richness. Are there any interesting patterns? For example, do most samples support an average number of gene, while only a few samples supporting either very few or very many gene? Or is the pattern different?

Second, what is the pattern in the distribution of sample total abundance?

  • The ninth plot is the ECDF of total sample abundance. How does it compare to the ECDF of sample richness?

Finally, to answer the question on the relation between total abundance and number of gene/sample …

  • The last plot is a scatter plot of the two variables. Is there is relationship between the number of genes per sample and the total abundance? Do gene-rich samples generally have a greater total abundance of those genes as well?
# note: you'll need to press return in the console for all plots 
#foa.plots(counts.ts.larvae)

Merge sample key info to count data, then sort, and generate heat map for initial inspection by treatment

# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day 
load(file="../raw-data/key")

counts.tsk.larvae <- merge(x=key, by.x="sample_stage", y=counts.ts.larvae, by.y="row.names") %>% 
  arrange(stage, population, pCO2.parent)  %>% column_to_rownames(var="sample_stage") 

head(counts.tsk.larvae[1:24]) #check out results of merge/arrange
##            lane sample RNA.conc RNA.ng endpoint.RFU pcr.cycles prep.batch
## 401_larvae    2    401       99    350          127         16          5
## 402_larvae    2    402        6    350           88         16          4
## 403_larvae    2    403       12    350           68         14          5
## 404_larvae    2    404        5    350           31         15          4
## 421_larvae    2    421       60    288           24         14          4
## 411_larvae    2    411       77    350          112         16          5
##                  species  stage             tissue DNA.ng bioan.mean.bp
## 401_larvae Ostrea lurida larvae whole body, pooled   1.24            26
## 402_larvae Ostrea lurida larvae whole body, pooled   2.76            27
## 403_larvae Ostrea lurida larvae whole body, pooled   4.16            28
## 404_larvae Ostrea lurida larvae whole body, pooled   2.92            28
## 421_larvae Ostrea lurida larvae whole body, pooled   1.63            28
## 411_larvae Ostrea lurida larvae whole body, pooled   3.08            18
##            index.no index.seq population temp.parent pCO2.parent larval.sample
## 401_larvae     7012    ATGAAC  Dabob Bay          10     Ambient          14-A
## 402_larvae     7006    GTGTAG  Dabob Bay          10     Ambient          31-A
## 403_larvae     7032    CGAAGG  Dabob Bay          10     Ambient          75-A
## 404_larvae     7046    CTCCAT  Dabob Bay          10     Ambient          80-A
## 421_larvae     7024    CCGCAA  Dabob Bay           6     Ambient          59-A
## 411_larvae     7011    TTAACT  Dabob Bay          10        High          23-A
##            filename OLUR_00020575 OLUR_00006628 OLUR_00032161 OLUR_00011450
## 401_larvae      401             1            69           263            17
## 402_larvae      402             2            58           218            12
## 403_larvae      403             0            72           420            78
## 404_larvae      404             0            52           929            57
## 421_larvae      421             0            78           190            26
## 411_larvae      411             1           101           495            21
##            OLUR_00018391
## 401_larvae             0
## 402_larvae             2
## 403_larvae             3
## 404_larvae             2
## 421_larvae             2
## 411_larvae             5
#counts.tsk.larvae %>% dplyr::select(starts_with("OLUR")) #this is code to get only the gene columns 

Generate heat map of counts before DESeq processing / analysis

NOTE: scale=“column” b/c range of counts is so huge, so counts have been scaled

pheatmap(data.matrix(counts.tsk.larvae %>% dplyr::select(starts_with("OLUR"))), Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA, 
                     show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE, 
                     scale="column", color=c("dodgerblue3", "goldenrod1"), 
                     main = "Oly Larvae Gene Counts", annotation_row=counts.tsk.larvae[,c("population", "pCO2.parent")],
         filename = "../results/heatmap-larval-counts.png")

Resulting heat map is located in results/ directory: heatmap-larvae-counts.png

Analysis in DESeq2

Reformat for DESeq, ensure correct sample order for

NOTE: It is absolutely critical that the columns of the count matrix and the rows of the column data (information about samples) are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.

all(rownames(counts.tsk.larvae) == counts.tsk.larvae %>% dplyr::select(starts_with("OLUR")) %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE' 
## [1] TRUE

Generate DESeq datasets with various treatment comparisons

#counts.DESeq <- counts.tsk.larvae[-which(rownames(counts.tsk.larvae) %in% "571_larvae"), grepl("OLUR", colnames(counts.tsk.larvae))] %>% t()
#key.DESeq <- counts.tsk.larvae[-which(rownames(counts.tsk.larvae) %in% "571_larvae"),c("population", "pCO2.parent")] 

dds.larvae <- DESeqDataSetFromMatrix(countData = counts.tsk.larvae[,grepl("OLUR", colnames(counts.tsk.larvae))] %>% t(),
                              colData = counts.tsk.larvae[,c("population", "pCO2.parent")] ,
                              design = ~ population + pCO2.parent)
## factor levels were dropped which had no samples
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]

Visualize data via PCAs and heat maps

Transform data

  • Here we transform counts using a variance stabilizing transformation (VST), since the rlog transformation threw an error and suggested using VST.
  • Here we use blind=FALSE b/c we are interested in differences explained by experimental design, and may wish to use this transformed data in downstream analyses.
vsd.larvae <- varianceStabilizingTransformation(dds.larvae, blind=FALSE)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]

Visualize sample clustering via PCA (after transformation)

NOTE: Hover over points to see the sample numbers

# PCA with points color coded by population 
#ggplotly(
plotPCA(vsd.larvae, intgroup="population") + 
           ggtitle("PCA by population (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text

# PCA with points color coded by parental pCO2 exposure 
#ggplotly(
plotPCA(vsd.larvae, intgroup="pCO2.parent") + 
           ggtitle("PCA by parental pCO2 exposure (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text

# PCA with points color coded by tissue and pco2 factors 
#ggplotly(
plotPCA(vsd.larvae, intgroup=c("population","pCO2.parent")) + 
           ggtitle("PCA by population + parental pCO2 (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text

Generate heat maps before & after transformation

# extract treatment info from VSD transformation 
#vsd.larvae.df <- as.data.frame(colData(vsd.larvae)[,c("population", "pCO2.parent")])

# generate heatmap from untransformed counts 
#pheatmap(counts(dds.larvae), cluster_rows=FALSE, show_rownames=FALSE,
         #cluster_cols=T, annotation_col=vsd.larvae.df, scale = "row", main="QuantSeq, untransformed data (but scaled by rows")

# generate heatmap from VSD counts 
#pheatmap(assay(vsd.larvae), cluster_rows=FALSE, show_rownames=FALSE,
         #cluster_cols=T, annotation_col=vsd.larvae.df, main = "QuantSeq, VSD-transformed")

Heatmap of the sample-to-sample distances

Another use of the transformed data is sample clustering. Here, we apply the dist function to the transpose of the transformed count matrix to get sample-to-sample distances.

A heatmap of this distance matrix gives us an overview over similarities and dissimilarities between samples. We have to provide a hierarchical clustering hc to the heatmap function based on the sample distances, or else the heatmap function would calculate a clustering based on the distances between the rows/columns of the distance matrix.

colors <- colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)

sampleDists <- dist(t(assay(vsd.larvae)))
sampleDistMatrix <- as.matrix(sampleDists)

# Here we show pCO2.parent + population 
rownames(sampleDistMatrix) <- paste(vsd.larvae$population, vsd.larvae$pCO2.parent, sep="-") #set row names 
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=rev(colors), fontsize = 6)

Differential Expression Analysis - multifactor design

Run the function DESeq to assess differential expression

dds.larvae.DESeq <- DESeq(dds.larvae) 
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]

Larvae: Differential Gene Expression Analysis by parental pCO2 exposure (all populations combined)

print("Comparison: parental pCO2 - All populations")
## [1] "Comparison: parental pCO2 - All populations"
summary(res.larvae.pco2 <- results(dds.larvae.DESeq, contrast=c("pCO2.parent", "Ambient", "High"), alpha=0.05)) #not with p=0.05, but checking p=0.1 just because
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pCO2, larvae, comparison across all pops:",  sum(res.larvae.pco2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pCO2, larvae, comparison across all pops: 0"
# nope

Larvae: Differential Gene Expression Analysis by population (all parental pCO2 exposure, combined)

# Here are all the possible contrasts I can make 
levels(dds.larvae.DESeq$population)
## [1] "Dabob Bay"     "Fidalgo Bay"   "Oyster Bay C1" "Oyster Bay C2"
print("Comparison: Dabob Bay vs. Fidalgo Bay")
## [1] "Comparison: Dabob Bay vs. Fidalgo Bay"
summary(res.larvae.DBFB <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Fidalgo Bay"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 188, 0.62%
## LFC < 0 (down)     : 127, 0.42%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 5312, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Dabob Bay vs. Oyster Bay C1")
## [1] "Comparison: Dabob Bay vs. Oyster Bay C1"
summary(res.larvae.DBOB1 <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Oyster Bay C1"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 143, 0.47%
## LFC < 0 (down)     : 125, 0.41%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 1771, 5.8%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Dabob Bay vs. Oyster Bay C2")
## [1] "Comparison: Dabob Bay vs. Oyster Bay C2"
summary(res.larvae.DBOB2 <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Oyster Bay C2"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 100, 0.33%
## LFC < 0 (down)     : 63, 0.21%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 3542, 12%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Fidalgo Bay vs. Oyster Bay C1")
## [1] "Comparison: Fidalgo Bay vs. Oyster Bay C1"
summary(res.larvae.FBOB1 <- results(dds.larvae.DESeq, contrast=c("population", "Fidalgo Bay", "Oyster Bay C1"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 247, 0.81%
## LFC < 0 (down)     : 414, 1.4%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 5312, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Fidalgo Bay vs. Oyster Bay C2")
## [1] "Comparison: Fidalgo Bay vs. Oyster Bay C2"
summary(res.larvae.FBOB2 <- results(dds.larvae.DESeq, contrast=c("population", "Fidalgo Bay", "Oyster Bay C2"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 126, 0.41%
## LFC < 0 (down)     : 128, 0.42%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 4722, 16%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Oyster Bay C1 vs. Oyster Bay C2")
## [1] "Comparison: Oyster Bay C1 vs. Oyster Bay C2"
summary(res.larvae.OB1OB2 <- results(dds.larvae.DESeq, contrast=c("population", "Oyster Bay C1", "Oyster Bay C2"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 1, 0.0033%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Count # of genes diff expressed (p-value <0.05) in each population comparison

paste("No. of genes differentially expressed (padj<0.05) between Dabob & Fidalgo larvae:",  sum(res.larvae.DBFB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Fidalgo larvae: 315"
paste("No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C1 larvae:",  sum(res.larvae.DBOB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C1 larvae: 268"
paste("No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C2 larvae:",  sum(res.larvae.DBOB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C2 larvae: 163"
paste("No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C1 larvae:",  sum(res.larvae.FBOB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C1 larvae: 661"
paste("No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C2 larvae:",  sum(res.larvae.FBOB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C2 larvae: 254"
paste("No. of genes differentially expressed (padj<0.05) between Oyster Bay C1 & C2 larvae:",  sum(res.larvae.OB1OB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Oyster Bay C1 & C2 larvae: 0"

Save population vs. population DEG lists for later comparisons

diffex.larvae.DBFB <- subset(res.larvae.DBFB, padj < 0.05)
diffex.larvae.DBOB1 <- subset(res.larvae.DBOB1, padj < 0.05)
diffex.larvae.DBOB2 <- subset(res.larvae.DBOB2, padj < 0.05)
diffex.larvae.FBOB1 <- subset(res.larvae.FBOB1, padj < 0.05)
diffex.larvae.FBOB2 <- subset(res.larvae.FBOB2, padj < 0.05)
# note: no differences between OB1 and OB2

# save all R objects to file for integration 
save(diffex.larvae.DBFB, file="../results/diffex.larvae.DBFB")
save(diffex.larvae.DBOB1, file="../results/diffex.larvae.DBOB1")
save(diffex.larvae.DBOB2, file="../results/diffex.larvae.DBOB2")
save(diffex.larvae.FBOB1, file="../results/diffex.larvae.FBOB1")
save(diffex.larvae.FBOB2, file="../results/diffex.larvae.FBOB2")

# merge and select unique genes for PCA 
degs.larvae.pops <- c(rownames(diffex.larvae.DBFB), 
  rownames(diffex.larvae.DBOB1),
  rownames(diffex.larvae.DBOB2),
  rownames(diffex.larvae.FBOB1),
  rownames(diffex.larvae.FBOB2)) %>% unique()
save(degs.larvae.pops, file="../results/degs.larvae.pops")

Generate PCA with only differentially expressed genes among population

This PCA represents the physiological differences among populations/cohorts of Olympia oyster larvae, upon maternal liberation.

# PCA with points color coded by tissue and pco2 factors 
#ggplotly(
  plotPCA(vsd.larvae[degs.larvae.pops,], intgroup=c("population")) + 
           ggtitle("Larval PCA by population, DEGs (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text

# generate heatmap of population DEGs  
vsd.larvae.df <- as.data.frame(colData(vsd.larvae)[,c("population", "pCO2.parent")])

pheatmap(assay(vsd.larvae[degs.larvae.pops,]), cluster_rows=T, show_rownames=FALSE, show_colnames=FALSE,
         cluster_cols=T, annotation_col=vsd.larvae.df[1], scale="row", main = "DEGs among populations")

Larvae: Differential Gene Expression Analysis by Parental pCO2, within each population

# Add a factor that defines interaction group 
dds.larvae.DESeq$group <- factor(paste0(dds.larvae.DESeq$population, dds.larvae.DESeq$pCO2.parent))
design(dds.larvae.DESeq) <- ~ group 
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
dds.larvae.DESeq <- DESeq(dds.larvae.DESeq)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]

Extract differential expression results / comparisons among different factors

Create results objects, but summary of results are also shown

NOTE: can only compare two treatments at a time

# Here are all the possible contrasts I can make 
levels(dds.larvae.DESeq$group)
## [1] "Dabob BayAmbient"     "Dabob BayHigh"        "Fidalgo BayAmbient"  
## [4] "Fidalgo BayHigh"      "Oyster Bay C1Ambient" "Oyster Bay C1High"   
## [7] "Oyster Bay C2Ambient" "Oyster Bay C2High"
print("Comparison: parental pCO2 - Fidalgo Bay")
## [1] "Comparison: parental pCO2 - Fidalgo Bay"
summary(res.larvae.FB <- results(dds.larvae.DESeq, contrast=c("group", "Fidalgo BayHigh", "Fidalgo BayAmbient"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 1, 0.0033%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Dabob Bay")
## [1] "Comparison: parental pCO2 - Dabob Bay"
summary(res.larvae.DB <- results(dds.larvae.DESeq, contrast=c("group", "Dabob BayHigh", "Dabob BayAmbient"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Oyster Bay Cohort 1")
## [1] "Comparison: parental pCO2 - Oyster Bay Cohort 1"
summary(res.larvae.OB1 <- results(dds.larvae.DESeq, contrast=c("group", "Oyster Bay C1High", "Oyster Bay C1Ambient"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Oyster Bay Cohort 2")
## [1] "Comparison: parental pCO2 - Oyster Bay Cohort 2"
summary(res.larvae.OB2 <- results(dds.larvae.DESeq, contrast=c("group", "Oyster Bay C2High", "Oyster Bay C2Ambient"), alpha=0.05))
## 
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 0, 0%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Count # of genes diff expressed (p-value <0.05) in each comparison

paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Fidalgo Bay larvae:",  sum(res.larvae.FB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Fidalgo Bay larvae: 1"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Dabob Bay larvae:",  sum(res.larvae.DB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Dabob Bay larvae: 0"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort1 larvae:",  sum(res.larvae.OB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort1 larvae: 0"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort2 larvae:",  sum(res.larvae.OB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort2 larvae: 0"

Extract stats for significantly different genes expressed in FB & DB cohort

diffex.larvae.FB <- subset(res.larvae.FB, padj < 0.05)
#diffex.larvae.DB <- subset(res.larvae.DB, padj < 0.05)

Extract counts for differentially expressed genes for FB & DB cohort

diffex.larvae.FB.counts <- subset(counts(dds.larvae.DESeq), rownames(dds.larvae.DESeq) %in% rownames(diffex.larvae.FB)) 
diffex.larvae.FB.counts <- diffex.larvae.FB.counts[,subset(key, stage=="larvae" & population=="Fidalgo Bay")$sample_stage]

Plot gene counts for DEGs

It can also be useful to examine the counts of reads for a single gene across the groups. A simple function for making this plot is plotCounts, which normalizes counts by sequencing depth and adds a pseudocount of 1/2 to allow for log scale plotting. The counts are grouped by the variables in intgroup, where more than one variable can be specified. Here we specify the gene which had the smallest p value from the results table created above. You can select the gene to plot by rowname or by numeric index.

Here we plot the genes differentially expressed in FB between parental pCO2 exposure

a <-  plotCounts(dds.larvae.DESeq, gene=rownames(diffex.larvae.FB), intgroup=c("population", "pCO2.parent"), returnData = TRUE)

ggplot(subset(a, population=="Fidalgo Bay") %>% rownames_to_column("sample"),
       aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
  geom_point(position=position_jitter(w = 0.1,h = 0)) +
    geom_text() +
    theme_bw() +
    ggtitle(rownames(diffex.larvae.FB)) +
    theme(plot.title = element_text(hjust = 0.5))

======

BONEYARD - not necessary due to low DEGs

generate heatmap with differentially expressed genes within FB

# dds.larvae.df <- as.data.frame(colData(dds.larvae)[as.data.frame(colData(dds.larvae))$population==
#                                                                    "Fidalgo Bay", c("population", "pCO2.parent")])
# 
# pheatmap(diffex.larvae.FB.counts[,order(colnames(diffex.larvae.FB.counts))], 
#          cluster_rows=T, show_rownames=FALSE, cluster_columns=T, na.rm=TRUE, scale="row", main = "FB Gene Counts, all differentially expressed genes among pCO2", annotation_col=dds.larvae.df.FB[ order(row.names(dds.larvae.df.FB)), ][2], color=c("dodgerblue3", "goldenrod1"))

generate heatmap with differentially expressed genes within DB

# dds.larvae.df.DB <- as.data.frame(colData(dds.larvae)[as.data.frame(colData(dds.larvae))$population==
#                                                                    "Dabob Bay", c("population", "pCO2.parent")])
# 
# pheatmap(diffex.larvae.DB.counts[,order(colnames(diffex.larvae.DB.counts))], 
#          cluster_rows=T, show_rownames=FALSE, cluster_columns=T, na.rm=TRUE, scale="row", main = "FB Gene Counts, all differentially expressed genes among pCO2", annotation_col=dds.larvae.df.DB[ order(row.names(dds.larvae.df.DB)), ][2], color=c("dodgerblue3", "goldenrod1"))

Barplots for lots of DEGs

# larvae.FB.p05.names <-rownames(diffex.larvae.FB[order(diffex.larvae.FB$padj),])
# count_plots = list()
# #plot the 10 genes with lowest p-values
# for (i in 1:10) {
# a <-  plotCounts(dds.larvae.DESeq, gene=larvae.FB.p05.names[i], intgroup=c("population", "pCO2.parent"), returnData = TRUE)
# b <- ggplot(subset(a, population=="Fidalgo Bay") %>% rownames_to_column("sample"),
#        aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
#   geom_point(position=position_jitter(w = 0.1,h = 0)) +
#     geom_text() +
#     theme_bw() +
#     ggtitle(larvae.FB.p05.names[i]) +
#     theme(plot.title = element_text(hjust = 0.5))
# count_plots[[i]] <- b
# }
# count_plots

Here we plot the genes differentially expressed in DB between parental pCO2 exposure

# larvae.DB.p05.names <-rownames(diffex.larvae.DB[order(diffex.larvae.DB$padj),])
# count_plots = list()
# #plot the 10 genes with lowest p-values
# for (i in 1:10) {
# a <-  plotCounts(dds.larvae.DESeq, gene=larvae.DB.p05.names[i], intgroup=c("population", "pCO2.parent"), returnData = TRUE)
# b <- ggplot(subset(a, population=="Dabob Bay") %>% rownames_to_column("sample"),
#        aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
#   geom_point(position=position_jitter(w = 0.1,h = 0)) +
#     geom_text() +
#     theme_bw() +
#     ggtitle(larvae.DB.p05.names[i]) +
#     theme(plot.title = element_text(hjust = 0.5))
# count_plots[[i]] <- b
# }
# count_plots

MORE BONEYARD, OR TBD ANALYSIS DEVELOPED DURING CRAB ANALYSIS / PILOT STUDY

Create master dataframe of all differentially expressed genes across all comparisons

# diffex.all.counts <- 
#   rbind.data.frame(
#     diffex.status.counts,
#     diffex.ColdVSWarm.counts,
#     diffex.AmbVSWarm.counts,
#     diffex.ColdVSAmb.counts,
#     diffex.9vs26.counts,
#     diffex.9vs12.counts,
#     diffex.12vs26.counts) %>% 
#   rownames_to_column("gene") %>%
#     arrange(gene)
#   
# # are there any duplicate genes? no. 
# diffex.all.counts[duplicated(diffex.all.counts), ]
# 
# # Move first column with gene names to row names 
# diffex.all.counts <- diffex.all.counts %>% 
#   column_to_rownames("gene")

Of all differentially expressed genes (all comparisons), which are the most influential of sample ordination in multivariate space?

Run PCA on list of differentially expressed genes (across all comparisons)

NOTE: set scale=false to use a variance-covariance matrix, putting more weight on genes with higher counts.

Notes from multivariate class notes: PCA is sensitive to the scale of measurement of the data. If all the data are not measured on the same scale, using covariance means that the result will be determined mostly by the variable with the largest values, as it will have the highest variance. Using a correlation matrix treats all variables the same (standardized to mean=0 and std. dev.=1). In prcomp(), this means specifying scale=TRUE in the function call.

Perform PCA

#diff.pca<-FactoMineR::PCA(t(diffex.all.counts),graph=F) #note: need to transform count frame for this PCA 

Scree plot shows how much variance is explained by each Principal Component (PC) Axis:

  • PC Axis 1 explains 31.3% of variance
  • PC Axis 2 explains 10.9% of variance
  • PC Axis 3 explains 9.3% of variance
#fviz_screeplot(diff.pca, addlabels = TRUE)

Look at the relative contributions of each SAMPLE to PC axes 1, 2, and 3

# fviz_contrib(diff.pca, choice = "ind", axes = 1) + ggtitle("Contribution of samples to PC dimension #1")
# fviz_contrib(diff.pca, choice = "ind", axes = 2) + ggtitle("Contribution of samples to PC dimension #2")
# fviz_contrib(diff.pca, choice = "ind", axes = 3) + ggtitle("Contribution of samples to PC dimension #3")

Plot PC scores for axis 1 ~ axis 2, by treatment

# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA 
# 
# # PCA plots with samples, color coded by treatments 
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC1 ~ PC2, color=infection status") 
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC1 ~ PC2, color=temperature") 
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC1 ~ PC2, color=day sampled")
# 
# # PCA with samples + top 10 genes contributing to PC scores 
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10)) 

Plot PC scores for axis 1 ~ axis 3, by treatment

# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA 
# 
# # PCA plots with samples, color coded by treatments 
# fviz_pca_biplot(diff.pca, axes = c(1,3), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC1 ~ PC3, color=infection status") 
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC1 ~ PC3, color=temperature") 
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC1 ~ PC3, color=day sampled")
# 
# # PCA with samples + top 10 genes contributing to PC scores 
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10)) 

Plot PC scores for axis 2 ~ axis 3, by treatment

# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA 
# 
# # PCA plots with samples, color coded by treatments 
# fviz_pca_biplot(diff.pca, axes = c(2,3), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC2 ~ PC3, color=infection status") 
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC2 ~ PC3, color=temperature") 
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC2 ~ PC3, color=day sampled")
# 
# # PCA with samples + top 10 genes contributing to PC scores 
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10)) 

Extract PC data for genes, showing their relative contributions to PC PC Axis 1 ~ Axis 2

# diff.pca.gene.data <- fviz_pca_var(diff.pca, axes = c(1,2), select.var = list(contrib = 100))$data 
# head(diff.pca.gene.data)

Extract PC scores for each sample - are they distributed normally in multivarite space?

# pca.sample.scores <- diff.pca$ind
# hist(unlist(as.data.frame(pca.sample.scores)[1:3], use.names=FALSE), main="Histogram of PC scores for dimensions 1, 2 & 3") #should have normal distribution for multivariate normality 

Prepare data for downtream statistics (if desired)

The following creates a master dataframe from the differentially expressed genes with sample id, treatment info, and counts, in long format

Note: instead of data.frame(t(diffex.all.counts)) %>% you could easily swap in data.frame(t(counts(dds.larvae))) %>% go generate the same dataframe, but with all genes (not just differentially expressed ones)

# counts4stats <- key %>% 
#   mutate(sample_stage=as.character(sample_stage)) %>%
#   right_join(
#     data.frame(t(diffex.all.counts)) %>% 
#   rownames_to_column("sample_stage")
#   ) %>% 
#   pivot_longer(cols=starts_with("TRINITY"), values_to = "count", names_to = "gene")
# head(counts4stats)

Even more visualization

MA-plots

In DESeq2, the function plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored red if the padj<0.05. Points which fall out of the window are plotted as open triangles pointing either up or down.

# plotMA(res.all.status, main="DEG by infection status\nLog2 fold change ~ mean of normalized counts")
# 
# plotMA(res.all.ColdVSWarm, main="DEGs between Cold & Warm\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.AmbVSWarm, main="DEGs between Ambient & Warm\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.ColdVSAmb, main="DEGs between Ambient & Cold\nLog2 fold change ~ mean of normalized counts")
# 
# plotMA(res.all.9vs26, main="DEGs between Day 9 & Day 26\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.9vs12, main="DEGs between Day 9 & Day 12\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.12vs26, main="DEGs between Day 12 & Day 26\nLog2 fold change ~ mean of normalized counts")

You can also make MA-plots with transformations

Shrinkage of effect size (LFC estimates) is useful for visualization and ranking of genes. To shrink the LFC, we pass the dds object to the function lfcShrink. Below we specify to use the apeglm method for effect size shrinkage (Zhu, Ibrahim, and Love 2018), which improves on the previous estimator.

We provide the dds object and the name or number of the coefficient we want to shrink, where the number refers to the order of the coefficient as it appears in resultsNames(dds).

Here we apply lfcShrink() function to the dds.larvae.DESeq object, and specify coef=2 to specify that we want to examine DEGs between infection status by each shrinkage method.

# resultsNames(dds.larvae.DESeq) # Check the order of coefficients to use in the lfcShrink function.
# # use `coef=2` to refers to infection status 
# 
# # Generate MA-plots after the different effect size shrinkage methods 
# par(mfrow=c(1,3), mar=c(4,4,2,1))
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="apeglm"),  main="apeglm")
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="normal"), main="normal")
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="ashr"), main="ashr")